Clasificador - hito 2¶

In [ ]:
import pandas as pd
import numpy as np
import pickle

import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme()
sns.set_theme(style="whitegrid")
plt.rc('axes', titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rcParams.update({'font.size': 16})
plt.rcParams['axes.titlesize'] = 16
plt.rcParams["figure.figsize"] = (10, 6)
plt.rcParams.update({'lines.markeredgewidth': 1})
plt.rcParams.update({'errorbar.capsize': 2})

file_names = {
    "df_es_mapping": "../../Data/mapping/df_es_mapping.pickle",
    "df_us_mapping": "../../Data/mapping/df_us_mapping.pickle",
    
    "df_es_test": "../../Data/test/df_es_test.pickle",
    "df_us_test": "../../Data/test/df_us_test.pickle",
    
    "df_es_train": "../../Data/train/df_es_train.pickle",
    "df_us_train": "../../Data/train/df_us_train.pickle",
    
    "df_es_trial": "../../Data/trial/df_es_trial.pickle",
    "df_us_trial": "../../Data/trial/df_us_trial.pickle",
}

# mas imports
from nltk.tokenize import TweetTokenizer
tt = TweetTokenizer()

cargar sets

In [ ]:
df_es_train = pickle.load(open(file_names["df_es_train"], "rb"))
df_es_trial = pickle.load(open(file_names["df_es_trial"], "rb"))
df_es_test = pickle.load(open(file_names["df_es_test"], "rb"))

df_us_train = pickle.load(open(file_names["df_us_train"], "rb"))
df_us_trial = pickle.load(open(file_names["df_us_trial"], "rb"))
df_us_test = pickle.load(open(file_names["df_us_test"], "rb"))

pre-procesamiento

In [ ]:
%%time
df_us_train['tokenized_text'] = df_us_train['text'].str.lower().apply(lambda x: " ".join(tt.tokenize(x)))
df_us_train.head()
CPU times: user 21.3 s, sys: 57.3 ms, total: 21.4 s
Wall time: 21.4 s
Out[ ]:
id text label tokenized_text
0 729044324441186304 Selfies for summatime @ Drexel University 12 selfies for summatime @ drexel university
1 663834134037442560 Ready to be a bulldog with rasso #hailstate #i... 14 ready to be a bulldog with rasso #hailstate #i...
2 747449193350963200 #scored my new #matcotools #slidehammer weight... 16 #scored my new #matcotools #slidehammer weight...
3 691439672761925637 @user last night was so much fun @ Skyway Thea... 6 @user last night was so much fun @ skyway theatre
4 758118895618109440 love beach days @ Manasquan Beach 12 love beach days @ manasquan beach
In [ ]:
df_us_test['tokenized_text'] = df_us_test['text'].str.lower().apply(lambda x: " ".join(tt.tokenize(x)))
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=5)
X_train_bow = vectorizer.fit_transform(df_us_train["tokenized_text"])
X_test_bow = vectorizer.transform(df_us_test["tokenized_text"])
In [ ]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X_train_bow, df_us_train["label"]);
In [ ]:
clf.score(X_train_bow, df_us_train["label"])
Out[ ]:
0.39701310639001064
In [ ]:
from sklearn.metrics import classification_report

df_us_mapping = pickle.load(open(file_names["df_us_mapping"], "rb")).sort_values("label")

y_pred = clf.predict(X_test_bow)
print(classification_report(df_us_test["label"], y_pred, target_names=df_us_mapping["emoji"]))
              precision    recall  f1-score   support

           ❤       0.35      0.58      0.44     10798
           😍       0.25      0.25      0.25      4830
           📷       0.16      0.16      0.16      1432
          🇺🇸       0.47      0.50      0.48      1949
           ☀       0.25      0.43      0.32      1265
           💜       0.32      0.05      0.08      1114
           😉       0.12      0.04      0.06      1306
           💯       0.27      0.14      0.19      1244
           😁       0.14      0.03      0.05      1153
           🎄       0.60      0.60      0.60      1545
           📸       0.29      0.10      0.15      2417
           😜       0.04      0.01      0.01      1010
           😂       0.30      0.52      0.38      4534
           💕       0.19      0.05      0.08      2605
           🔥       0.45      0.47      0.46      3716
           😊       0.09      0.06      0.07      1613
           😎       0.16      0.11      0.13      1996
           ✨       0.29      0.18      0.22      2749
           💙       0.22      0.07      0.10      1549
           😘       0.16      0.05      0.08      1175

    accuracy                           0.32     50000
   macro avg       0.26      0.22      0.22     50000
weighted avg       0.29      0.32      0.28     50000

In [ ]:
vocab = {k: v for v, k in enumerate(vectorizer.get_feature_names_out())}

vec_test = np.zeros(X_train_bow.shape[1])
k = vocab["santa"]
vec_test[k] = 1
print(vectorizer.inverse_transform([vec_test])[0][0])
clf.predict_proba([vec_test])
santa
Out[ ]:
array([[0.21267139, 0.10559105, 0.03170409, 0.02263593, 0.06533366,
        0.01011697, 0.02992065, 0.00932074, 0.02055197, 0.11856118,
        0.02276163, 0.02159743, 0.10988494, 0.03433423, 0.01711626,
        0.03922043, 0.05487754, 0.02576475, 0.02707579, 0.02095938]])

GridSearch

In [ ]:
%%capture

from sklearn.metrics import f1_score, accuracy_score

dfs = [1,2,3,4,5,6,7,8,9,10]
alphas = [0, 0.2, 0.4, 0.6, 0.8, 1]

f1_m = {}
f1_w = {}
scores={}

for j in alphas:
    f1_m[j] = {}   
    f1_w[j] = {}
    scores[j] = {}
    for i in dfs:
        #Vectorización 
        vectorizer = CountVectorizer(min_df=i) #Definimos la cantidad de veces que se repite una palabra para que el clasificador la tome en consideración.
        X_train_bow = vectorizer.fit_transform(df_us_train["tokenized_text"])
        X_test_bow = vectorizer.transform(df_us_test["tokenized_text"])

        #Obtenemos el clf score para el clasificador
        clf = MultinomialNB(alpha=j)
        clf.fit(X_train_bow, df_us_train["label"])
        clf.score(X_train_bow, df_us_train["label"])

        y_pred = clf.predict(X_test_bow)
        y_true = df_us_test["label"]
        
        # Se llenan los diccionarios
        f1_m[j][i] = f1_score(y_true, y_pred, average="macro")
        f1_w[j][i] = f1_score(y_true, y_pred, average="weighted")   
        scores[j][i] = accuracy_score(y_true, y_pred)
In [ ]:
scores_array = []
for i in scores:

    data = np.array(list(scores[i].values()) ).reshape(len(dfs),)
    scores_array.append(data)
In [ ]:
x_axis_labels = []

plt.title("Accuracy del clasificador para distintos parámetros")
sns.heatmap(scores_array, annot=True, xticklabels=dfs, yticklabels=alphas)
plt.xlabel('minimas ocurrencias para token')
plt.ylabel('alpha');
In [ ]:
f1_m_array = []

for i in f1_m:

    data = np.array(list(f1_m[i].values()) ).reshape(len(dfs),) # transform to 2D
    f1_m_array.append(data)

f1_m_array

plt.title("Macro-f1 del clasificador para distintos parámetros")
sns.heatmap(f1_m_array, annot=True, xticklabels=dfs, yticklabels=alphas)
plt.xlabel('minimas ocurrencias para token')
plt.ylabel('alpha');
In [ ]:
f1_w_array = []

for i in f1_w:
    data = np.array(list(f1_w[i].values()) ).reshape(len(dfs),) # transform to 2D
    f1_w_array.append(data)

f1_w_array

plt.title("Weighted-f1 del clasificador para distintos parámetros")
sns.heatmap(f1_w_array, annot=True, xticklabels=dfs, yticklabels=alphas)
plt.xlabel('minimas ocurrencias para token')
plt.ylabel('alpha');

Finalmente, como resulado de este grid-search, escogemos el clasificador con el mejor macro-f1.

In [ ]:
best_alpha, best_min_df = np.unravel_index(np.argmax(np.array(f1_m_array)),shape=(len(alphas),len(dfs)))

scr, macf1, weif1 = scores_array[best_alpha][best_min_df], f1_m_array[best_alpha][best_min_df], f1_w_array[best_alpha][best_min_df]
best_alpha, best_min_df = alphas[best_alpha], dfs[best_min_df]

print("Parámetros escogidos:\n\talpha = {}\n\tminimas ocurrencias para token = {}".format(best_alpha,best_min_df))
print("Resultados de clasificación:\n\taccuracy = {}\n\tmacro f1 = {}\n\tweighted f1 = {}".format(scr,macf1,weif1))
Parámetros escogidos:
	alpha = 0.2
	minimas ocurrencias para token = 10
Resultados de clasificación:
	accuracy = 0.30642
	macro f1 = 0.22695113269450196
	weighted f1 = 0.2876509870746095
In [ ]:
vectorizer = CountVectorizer(min_df=best_min_df)
X_train_bow = vectorizer.fit_transform(df_us_train["tokenized_text"])
X_test_bow = vectorizer.transform(df_us_test["tokenized_text"])

clf = MultinomialNB(alpha=best_alpha)
clf.fit(X_train_bow, df_us_train["label"])

y_pred = clf.predict(X_test_bow)
print(classification_report(df_us_test["label"], y_pred, target_names=df_us_mapping["emoji"]))
              precision    recall  f1-score   support

           ❤       0.38      0.48      0.42     10798
           😍       0.26      0.24      0.25      4830
           📷       0.14      0.17      0.16      1432
          🇺🇸       0.42      0.52      0.46      1949
           ☀       0.23      0.49      0.31      1265
           💜       0.23      0.06      0.10      1114
           😉       0.10      0.08      0.09      1306
           💯       0.21      0.20      0.20      1244
           😁       0.10      0.06      0.07      1153
           🎄       0.57      0.64      0.60      1545
           📸       0.26      0.15      0.19      2417
           😜       0.07      0.03      0.04      1010
           😂       0.33      0.47      0.39      4534
           💕       0.18      0.08      0.11      2605
           🔥       0.45      0.45      0.45      3716
           😊       0.09      0.07      0.08      1613
           😎       0.15      0.12      0.13      1996
           ✨       0.27      0.21      0.23      2749
           💙       0.19      0.10      0.13      1549
           😘       0.13      0.10      0.11      1175

    accuracy                           0.31     50000
   macro avg       0.24      0.24      0.23     50000
weighted avg       0.28      0.31      0.29     50000

Top palabras por emoji¶

In [ ]:
%%time
vocab_length = X_train_bow.shape[1]
proba_matrix = np.array([clf.predict_proba(np.eye(1,vocab_length,k))[0] for k in range(vocab_length)])
CPU times: user 14.5 s, sys: 200 ms, total: 14.7 s
Wall time: 2.45 s
In [ ]:
print("Largo del vocabulario = {}".format(vocab_length))
Largo del vocabulario = 17777
In [ ]:
def topPalabras(proba_matrix,emoji_id,k=5):
    # retorna las palabras para las cuales el emoji en cuestión tiene mas probabilidad
    prob = proba_matrix[:,emoji_id]  # mmm
    ind = np.argpartition(prob,-k)[-k:]
    val = prob[ind]
    palabras = [vectorizer.inverse_transform([np.eye(1,vocab_length,k)[0]])[0][0] for k in ind]
    return palabras, val
In [ ]:
i = 9
map_emojis = df_us_mapping["label"].values
print(df_us_mapping["emoji"][int(map_emojis[i])])
topPalabras(proba_matrix,i)
🎄
Out[ ]:
(['christmasdecorations',
  'litmas',
  'xmas2016',
  'ohchristmastree',
  'christmastree'],
 array([0.72071568, 0.74812904, 0.82830986, 0.84557572, 0.84607565]))
In [ ]:
for i in range(20):
    print(df_us_mapping["emoji"][int(map_emojis[i])])
    pal, val = topPalabras(proba_matrix,i)
    print(dict([(pal[j],val[j]) for j in range(len(pal))]))
❤
{'sibabes': 0.6906620911053236, '1luv': 0.7080419038362007, 'loveofmylife': 0.7274475019059425, 'cityofbrotherlylove': 0.7704815602436228, 'kfodiaries': 0.8070402343368355}
😍
{'asada': 0.5990262952613353, 'eatgood': 0.5996593037938316, 'gorg': 0.6241783754379878, 'beignet': 0.6351926966791812, 'swooning': 0.6352711964994492}
📷
{'itsamazingoutthere': 0.6389010333752864, 'acmecups': 0.660592709519073, 'gdlfashion': 0.7662663586732787, 'shredforaliving': 0.7800792961284052, 'bvillain': 0.8376467535122063}
🇺🇸
{'govote': 0.8973452218823189, 'flagday': 0.8980841887190848, 'godblessamerica': 0.9121465737229977, 'ivoted': 0.928500356765573, 'merica': 0.9191195807778645}
☀
{'funinthesun': 0.5716137881429628, 'photographer_serena': 0.757053319075366, 'sunshine': 0.6184021102032239, 'beachin': 0.6173871129611473, 'soakin': 0.6944983709464954}
💜
{'purple': 0.5960588343807284, 'ripprince': 0.6271088308339544, 'alzheimer': 0.6412853074207796, 'purplerain': 0.7114808172441944, 'endalz': 0.8001891262720133}
😉
{'phrase': 0.28093525373112144, 'womeninbusiness': 0.3329188840379718, 'wink': 0.3390827215121073, 'backtowork': 0.39444625521706006, 'mividaesunatombola': 0.7018889694726067}
💯
{'childrenofthekorn': 0.7339966923546437, 'eclecticeatery': 0.7339966923546437, 'keepit': 0.7339966923546437, 't3t': 0.7674635519130235, 'rns': 0.7750391313758169}
😁
{'dds': 0.3510216749896364, 'orthodontics': 0.3603248046689727, 'dentistry': 0.3909027792483965, 'braces': 0.40745211042891116, 'djsty': 0.557609751229746}
🎄
{'christmasdecorations': 0.7207156751790977, 'litmas': 0.7481290388684553, 'xmas2016': 0.8283098580938284, 'ohchristmastree': 0.8455757158810366, 'christmastree': 0.8460756453206805}
📸
{'mugshot': 0.42634416193247143, 'mag': 0.45683482365455375, 'opus': 0.6461331201290335, 'bricks': 0.5113096682127451, 'banshee': 0.4884470470525251}
😜
{'burpees': 0.2688657987632991, 'valhalla': 0.2743641853947739, 'wacky': 0.3005927111209562, 'cray': 0.3308708372868768, 'punny': 0.291050306761479}
😂
{'hilarious': 0.8093214442595983, 'lmao': 0.8666822331741617, 'postavideoyoucantexplain': 0.9015484676823673, 'lmaooo': 0.8720264005821722, 'lmfao': 0.8848425032541762}
💕
{'twinny': 0.4472684534700525, 'breastcancerawareness': 0.46099429269204467, 'breast': 0.5126153188390314, 'strides': 0.5650094801000912, 'endorsement': 0.5052248316626653}
🔥
{'onfire': 0.7646572307445548, 'flame': 0.7719807881190203, 'fuego': 0.8230384154889793, 'flames': 0.835786721510818, 'instant_classic': 0.7785377789929019}
😊
{'cab': 0.3247865448520753, '225': 0.34567478478513397, 'trigger': 0.4008625897400202, 'airline': 0.3794104695377874, 'goodtime': 0.3947705031889314}
😎
{'chillen': 0.4975239045721047, 'digg': 0.554659836582496, 'beautique': 0.6002067132711699, 'eyewear': 0.6224398900497997, 'sunglasses': 0.6021041448507571}
✨
{'peterpan': 0.53351226494345, 'pixie': 0.5617734587431917, 'getonshimmur': 0.7415065228535964, 'mmxvi': 0.5757817835382649, 'sparkle': 0.614448801966565}
💙
{'gobigblue': 0.6273970051454849, 'autism': 0.6360408401568516, 'foreverroyal': 0.6785386186263898, 'itsaboy': 0.6837890092234119, 'bbn': 0.652002279044257}
😘
{'besos': 0.3825325387970243, 'kisses': 0.41708922225393286, 'smooch': 0.5412487387547843, 'kissy': 0.5448144222545613, 'smooches': 0.5884696315919746}

Visualización de tokens según Naive Bayes¶

Esta seccion consiste en una visualizacion de los tokens segun la codificacion que nos entrega Naive Bayes. De la seccion anterior, se pueden obtener las probabilidad de que un token pertenezca a una clase dada. En nuestro caso, a un emoji dado. Esto es:

$$ P(w \in C) = \frac{\text{\#(tweets donde $w$ es uno de sus tokens y el tweet tiene el emoji $C$)}}{\text{\#(tweets con el token $w$)}} $$

De esta manera, cada token posee un vector de probabilidades. Donde la $C-$esima componente corresponde a $P(w \in C)$. Es decir,

$$\vec{w} = (P(w \in C) : \text{$C$ es un emoji})$$

En particular, cada vector $\vec{w}$ es uno con tantas coordenadas como emojis (20 en Ingles). Y cada coordenada esta entre 0 y 1. Es decir, cada $\vec{w} \in [0, 1]^{\text{\#Emojis}}$.

Ahora bien, es de nuestro interes visualizar cada token segun su vector de probabilidad. Sin embargo, es necesario reducir la dimensionalidad de cada vector a una facil de interpretar (en nuestro caso 2-dimensiones). Para esto, se utiliza un metodo de reduccion de dimensionalidad denominado UMAP y ampliamente utilizado para la visualizacion de datos en altas dimensiones.

In [ ]:
import umap.umap_ as umap
In [ ]:
%%time
reducer = umap.UMAP(n_neighbors=15)
to_R2 = reducer.fit_transform(proba_matrix)
to_R2.shape
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
CPU times: user 59.4 s, sys: 945 ms, total: 1min
Wall time: 15.8 s
Out[ ]:
(17777, 2)

Luego de reducir los vectores de probabilidad a uno de bi-dimensional, visualizaremos segun dos aspectos el espacio de tokens. Primero, se colorean los vectores segun el emoji con mayor probabilidad. Por ejemplo, si el token $happy$ tiene mayor probabilidad de estar en la clase $smile$, entonces se asocia este token con dicho emoji. La razon de esto es solo para simplificar el analisis. Segundo, existen tokens con probabilidades maximas mas grandes que otras, es decir, tokens asociados a un mismo emoji (segun el criterio anterior) que poseen probabilidades distintas de pertenecer a dicha clase. Para observar esto, se visualizan los token con puntos de diferente tamaño y proporcional a tal probabilidad.

In [ ]:
df_umap = pd.DataFrame(to_R2)
df_umap["token"] = vectorizer.get_feature_names_out()
df_umap["label"] = map_emojis[np.argmax(proba_matrix, axis=1).astype(int)]
df_umap["proba"] = np.max(proba_matrix, axis=1)
df_umap = df_umap.merge(df_us_mapping, on="label", how="left")
df_umap
Out[ ]:
0 1 token label proba emoji name
0 -0.177705 4.321781 00 0 0.149490 ❤ _red_heart_
1 -1.404681 3.566153 000 0 0.138475 ❤ _red_heart_
2 -0.371582 0.889135 007 0 0.243937 ❤ _red_heart_
3 1.903484 1.820665 01 0 0.231225 ❤ _red_heart_
4 1.033077 2.755328 02 0 0.187876 ❤ _red_heart_
... ... ... ... ... ... ... ...
17772 0.347587 0.319676 zumba 0 0.241695 ❤ _red_heart_
17773 -0.164445 3.247977 zumbafitness 14 0.140855 😉 _winking_face_
17774 0.608489 -1.054816 δδδ 0 0.241686 ❤ _red_heart_
17775 -0.678625 -0.585214 σκ 13 0.278696 💜 _purple_heart_
17776 -1.265634 -0.655227 σσς 13 0.440488 💜 _purple_heart_

17777 rows × 7 columns

In [ ]:
data = []
for label in df_us_mapping["label"]:
    sub_df = df_umap[df_umap["label"] == label]
    data.append(
        go.Scattergl(
            x = sub_df[0],
            y = sub_df[1],
            mode='markers',
            text=sub_df["token"]+"<br>"+sub_df["emoji"]+"<br>"+sub_df["proba"].apply(lambda x: str(np.round(x, 3))),
            name=sub_df["emoji"].iloc[0],
            marker=dict(
                size=25*sub_df["proba"],
                line_width=0.2,
            )
        )
    )
    
fig = go.Figure(data=data)
fig.update_layout(
    title="Proyección (UMAP) de vectores de probabilidad de tokens",
    autosize=False,
    width=700,
    height=500,
)
fig.show(renderer="notebook")

Comentarios El top 5 de la seccion anterior se puede capturar con los primero cinco punto de mayor tamaño para un emoji dado. Tambien, se observa que la clase con mas puntos corresponde al emoji del corazon. Mismo emoji con mayor popularidad visto en la etapa de analisis de los datos. Se observan grupos diferenciados, pero que logran solaparse. Esta zona coincide con aquellos tokens con probabilidades uniformes de pertenecer a cada clase y/o con probabilidad maxima cercanas a 0.1.

Distribución de clases usando subsampling

In [ ]:
df_us_train1 = pickle.load(open(file_names["df_us_train"], "rb"))
In [ ]:
print("Distribucion de clases original")
counts = df_us_train1['label'].value_counts()
counts
Distribucion de clases original
Out[ ]:
0     83611
1     40934
2     40396
4     20042
3     19991
5     18493
6     17127
7     13890
10    13035
9     12671
8     12662
11    11758
15    10734
18    10721
14    10689
12    10515
16    10474
17     9969
13     9898
19     9682
Name: label, dtype: int64
In [ ]:
min_freq = np.min(counts.values)
min_class = list(counts.index)[np.argmin(counts.values)]
print("Mínima frecuencia entre las distintas clases = {}\nEmoji class = {}".format(min_class,min_class))
Mínima frecuencia entre las distintas clases = 19
Emoji class = 19
In [ ]:
reduce_index = list(counts.index)
reduce_index.remove(min_class)
In [ ]:
for label in reduce_index:
    freq = counts[label]
    delete_counts = freq - min_freq
    df_us_train1 = df_us_train1.reset_index(drop=True)
    ## subsampling sobre la clase label
    idx = np.random.choice(df_us_train1.loc[df_us_train1.label == label].index, size=delete_counts, replace=False)
    data_subsampled = df_us_train1.drop(df_us_train1.iloc[idx].index, inplace = True)
In [ ]:
%%time
df_us_train1['tokenized_text'] = df_us_train1['text'].str.lower().apply(lambda x: " ".join(tt.tokenize(x)))
df_us_train1.head()
CPU times: user 9.02 s, sys: 15.9 ms, total: 9.04 s
Wall time: 9.1 s
Out[ ]:
id text label tokenized_text
0 729044324441186304 Selfies for summatime @ Drexel University 12 selfies for summatime @ drexel university
1 663834134037442560 Ready to be a bulldog with rasso #hailstate #i... 14 ready to be a bulldog with rasso #hailstate #i...
2 747449193350963200 #scored my new #matcotools #slidehammer weight... 16 #scored my new #matcotools #slidehammer weight...
3 758118895618109440 love beach days @ Manasquan Beach 12 love beach days @ manasquan beach
4 689978401587892224 With COCO @ New York, New York 16 with coco @ new york , new york
In [ ]:
df_us_test['tokenized_text'] = df_us_test['text'].str.lower().apply(lambda x: " ".join(tt.tokenize(x)))
In [ ]:
vectorizer = CountVectorizer(min_df=10)
X_train_bow1 = vectorizer.fit_transform(df_us_train1["tokenized_text"])
X_test_bow = vectorizer.transform(df_us_test["tokenized_text"])
clf = MultinomialNB(alpha = .2)
clf.fit(X_train_bow1, df_us_train1["label"])

clf.score(X_train_bow1, df_us_train1["label"])
Out[ ]:
0.37795909935963645
In [ ]:
df_us_mapping = pickle.load(open(file_names["df_us_mapping"], "rb")).sort_values("label")

y_pred = clf.predict(X_test_bow)
print(classification_report(df_us_test["label"], y_pred, target_names=df_us_mapping["emoji"]))
              precision    recall  f1-score   support

           ❤       0.45      0.10      0.16     10798
           😍       0.26      0.14      0.19      4830
           📷       0.13      0.20      0.16      1432
          🇺🇸       0.34      0.56      0.42      1949
           ☀       0.18      0.56      0.27      1265
           💜       0.09      0.12      0.10      1114
           😉       0.07      0.13      0.09      1306
           💯       0.15      0.30      0.20      1244
           😁       0.07      0.11      0.09      1153
           🎄       0.44      0.70      0.54      1545
           📸       0.21      0.21      0.21      2417
           😜       0.05      0.08      0.06      1010
           😂       0.39      0.28      0.32      4534
           💕       0.14      0.17      0.15      2605
           🔥       0.46      0.41      0.44      3716
           😊       0.08      0.10      0.09      1613
           😎       0.13      0.11      0.12      1996
           ✨       0.22      0.24      0.23      2749
           💙       0.11      0.15      0.13      1549
           😘       0.08      0.21      0.12      1175

    accuracy                           0.22     50000
   macro avg       0.20      0.24      0.21     50000
weighted avg       0.28      0.22      0.22     50000

Conclusiones

A partir del subsampling, se puede observar que los resultados no presentan una mejorar e incluso en varias clases disminuye la efectividad del clasificador, por lo que es mejor no hacer el subsampling para obtener mejores metricas.